################################################################################ 
#
# The distribution of hate speech and its implications for content moderation
# PSRM - Replication package
# Table C1
#
################################################################################ 


library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(forcats)

# Clear the workspace, keeping only objects whose names start with "wd"
# (e.g. wd_data, wd_res, wd_data_processed used below) and the object named
# exactly "setsave".
rm(list = setdiff(ls(), ls(pattern = "^wd|^setsave$")))

# load anonymized data
aggregated_data <- readRDS(paste0(wd_data, "/twitter_user_info/us_aggregated_by_user.rds"))

# Step 1: Prepare data
#   - log_followers / log_following / log_tweet_count: log10(count + 1)
#   - country_given: TRUE when places_country is not missing
#   - rows with a missing ever_hatespeech are dropped
#   - anonymity_bool: 0 when anonymity >= -1, 1 otherwise (NA stays NA)
aggregated_data <- aggregated_data %>%
  mutate(log_followers = log10(followers_count + 1),
         log_following = log10(following_count + 1),
         log_tweet_count = log10(tweet_count + 1),
         country_given = !is.na(places_country)) %>%
  filter(!is.na(ever_hatespeech)) %>%
  mutate(anonymity_bool = ifelse(anonymity >= -1, 0, 1))

# Step 2: Define variables of interest
# Indicator variables (compared between groups with a proportion test below)
binary_vars <- c(
  "verified",
  "anonymity_bool",
  "country_given",
  "name_matching"
)
# Continuous variables (compared between groups with a t-test below)
continuous_vars <- c(
  "log_followers",
  "log_following",
  "log_tweet_count",
  "listed_count",
  "account_age_days",
  "anonymity",
  "entropy"
)


### ===== 1. Create Summary Table for ever_hatespeech =====
# Binary variables: compare the share of each indicator between users with
# ever_hatespeech == TRUE and users with ever_hatespeech == FALSE.
# Result: a list with one single-row tibble per entry of binary_vars, holding
# the group means, group standard deviations, the difference in means
# (TRUE minus FALSE) and the p-value of a test of association.
binary_results <- lapply(binary_vars, function(var) {
  # Cross-tabulation of the indicator against the hate-speech flag
  tab <- table(aggregated_data[[var]], aggregated_data$ever_hatespeech)
  # Proportions (the mean of an indicator is its share)
  p_true <- mean(aggregated_data[[var]][aggregated_data$ever_hatespeech == TRUE], na.rm = TRUE)
  p_false <- mean(aggregated_data[[var]][aggregated_data$ever_hatespeech == FALSE], na.rm = TRUE)
  # Standard deviations (for completeness)
  sd_true <- sd(aggregated_data[[var]][aggregated_data$ever_hatespeech == TRUE], na.rm = TRUE)
  sd_false <- sd(aggregated_data[[var]][aggregated_data$ever_hatespeech == FALSE], na.rm = TRUE)
  # Fisher's exact test when any observed cell count is below 5, otherwise a
  # chi-squared test. If the test errors, a warning naming the variable is
  # raised and the p-value is a numeric NA, so the column type stays numeric.
  test_result <- tryCatch({
    if (any(tab < 5)) {
      fisher.test(tab)
    } else {
      chisq.test(tab)
    }
  }, error = function(e) {
    warning("Association test failed for '", var, "': ",
            conditionMessage(e), call. = FALSE)
    list(p.value = NA_real_)
  })
  
  tibble(
    variable = var,
    mean_TRUE = p_true,
    sd_TRUE = sd_true,
    mean_FALSE = p_false,
    sd_FALSE = sd_false,
    mean_diff = p_true - p_false,
    p_value = test_result$p.value
  )
})

# Continuous variables: compare group means between users with
# ever_hatespeech == TRUE and users with ever_hatespeech == FALSE.
# Result: a list with one single-row tibble per entry of continuous_vars,
# holding the group means, group standard deviations, the difference in means
# (TRUE minus FALSE) and the p-value of a two-sample t-test.
continuous_results <- lapply(continuous_vars, function(var) {
  # Mean and standard deviation of the variable within each group
  group_stats <- aggregated_data %>%
    group_by(ever_hatespeech) %>%
    summarise(
      mean = mean(.data[[var]], na.rm = TRUE),
      sd = sd(.data[[var]], na.rm = TRUE),
      .groups = "drop"
    )
  
  mean_true <- group_stats$mean[group_stats$ever_hatespeech == TRUE]
  mean_false <- group_stats$mean[group_stats$ever_hatespeech == FALSE]
  sd_true <- group_stats$sd[group_stats$ever_hatespeech == TRUE]
  sd_false <- group_stats$sd[group_stats$ever_hatespeech == FALSE]
  
  # Raw values per group, as input for the t-test
  values_true <- aggregated_data[[var]][aggregated_data$ever_hatespeech == TRUE]
  values_false <- aggregated_data[[var]][aggregated_data$ever_hatespeech == FALSE]
  
  # t.test() with its defaults (Welch test, unequal variances). If the test
  # errors, a warning naming the variable is raised and the p-value is a
  # numeric NA, so the column type stays numeric.
  t_test <- tryCatch(
    t.test(values_true, values_false),
    error = function(e) {
      warning("t-test failed for '", var, "': ",
              conditionMessage(e), call. = FALSE)
      list(p.value = NA_real_)
    }
  )
  
  tibble(
    variable = var,
    mean_TRUE = mean_true,
    sd_TRUE = sd_true,
    mean_FALSE = mean_false,
    sd_FALSE = sd_false,
    mean_diff = mean_true - mean_false,
    p_value = t_test$p.value
  )
})

# Stack the per-variable rows (binary first, then continuous) and round every
# numeric column to three decimals
summary_table_ever <- mutate(
  bind_rows(binary_results, continuous_results),
  across(where(is.numeric), function(x) round(x, digits = 3))
)

# Display the table
summary_table_ever


### ===== 2. Create Summary Table for median_hatespeech =====
# Binary variables: compare the share of each indicator between users labelled
# "below median" and users labelled "above median" in median_hatespeech.
# Result: a list with one single-row tibble per entry of binary_vars, holding
# the group means, group standard deviations, the difference in means
# (below minus above) and the p-value of a test of association.
binary_results_median <- lapply(binary_vars, function(var) {
  # Cross-tabulation of the indicator against the median split
  tab <- table(aggregated_data[[var]], aggregated_data$median_hatespeech)
  
  # Rows where median_hatespeech is NA index as NA here and are then dropped
  # by na.rm = TRUE, so they do not enter either group.
  p_below <- mean(aggregated_data[[var]][aggregated_data$median_hatespeech == "below median"], na.rm = TRUE)
  p_above <- mean(aggregated_data[[var]][aggregated_data$median_hatespeech == "above median"], na.rm = TRUE)
  
  sd_below <- sd(aggregated_data[[var]][aggregated_data$median_hatespeech == "below median"], na.rm = TRUE)
  sd_above <- sd(aggregated_data[[var]][aggregated_data$median_hatespeech == "above median"], na.rm = TRUE)
  
  # Fisher's exact test when any observed cell count is below 5, otherwise a
  # chi-squared test. If the test errors, a warning naming the variable is
  # raised and the p-value is a numeric NA, so the column type stays numeric.
  test_result <- tryCatch({
    if (any(tab < 5)) {
      fisher.test(tab)
    } else {
      chisq.test(tab)
    }
  }, error = function(e) {
    warning("Association test (median split) failed for '", var, "': ",
            conditionMessage(e), call. = FALSE)
    list(p.value = NA_real_)
  })
  
  tibble(
    variable         = var,
    Below_mean       = p_below,
    Below_sd         = sd_below,
    Above_mean       = p_above,
    Above_sd         = sd_above,
    Median_mean_diff = p_below - p_above,
    Median_p_value   = test_result$p.value
  )
})

# Continuous variables: compare group means between users labelled
# "below median" and users labelled "above median" in median_hatespeech.
# Result: a list with one single-row tibble per entry of continuous_vars,
# holding the group means, group standard deviations, the difference in means
# (below minus above) and the p-value of a two-sample t-test.
continuous_results_median <- lapply(continuous_vars, function(var) {
  # Mean and standard deviation of the variable within each median group
  group_stats <- aggregated_data %>%
    group_by(median_hatespeech) %>%
    summarise(
      mean = mean(.data[[var]], na.rm = TRUE),
      sd   = sd(.data[[var]], na.rm = TRUE),
      .groups = "drop"
    )
  
  mean_below <- group_stats$mean[group_stats$median_hatespeech == "below median"]
  mean_above <- group_stats$mean[group_stats$median_hatespeech == "above median"]
  sd_below   <- group_stats$sd[group_stats$median_hatespeech == "below median"]
  sd_above   <- group_stats$sd[group_stats$median_hatespeech == "above median"]
  
  # t.test() with its defaults (Welch test, unequal variances). If the test
  # errors, a warning naming the variable is raised and the p-value is a
  # numeric NA, so the column type stays numeric.
  t_test <- tryCatch(
    t.test(aggregated_data[[var]][aggregated_data$median_hatespeech == "below median"],
           aggregated_data[[var]][aggregated_data$median_hatespeech == "above median"]),
    error = function(e) {
      warning("t-test (median split) failed for '", var, "': ",
              conditionMessage(e), call. = FALSE)
      list(p.value = NA_real_)
    }
  )
  
  tibble(
    variable         = var,
    Below_mean       = mean_below,
    Below_sd         = sd_below,
    Above_mean       = mean_above,
    Above_sd         = sd_above,
    Median_mean_diff = mean_below - mean_above,
    Median_p_value   = t_test$p.value
  )
})

# Stack the median-split rows (binary first, then continuous) and round every
# numeric column to three decimals
summary_table_median <- mutate(
  bind_rows(binary_results_median, continuous_results_median),
  across(where(is.numeric), function(x) round(x, digits = 3))
)

# Put both comparisons side by side, matched on the variable name
final_summary_table <- summary_table_ever %>%
  full_join(summary_table_median, by = "variable")

# Display final table
final_summary_table

# Step 6 and 7: Replace the internal variable names with display labels, then
# give the ever_hatespeech columns their display headers (the median-split
# columns keep their names).
final_summary_table <- final_summary_table %>%
  mutate(variable = recode(variable, !!!c(
    verified         = "Verified Account Share",
    anonymity_bool   = "Anonymous Accounts Share",
    country_given    = "Profile contains a Country",
    log_followers    = "log(Followers Count)",
    log_following    = "log(Friends Count)",
    log_tweet_count  = "log(Status Count)",
    account_age_days = "Account Age (days)",
    listed_count     = "No. of lists user is a member of",
    anonymity        = "Personal Identification Score",
    entropy          = "User Name Entropy Score",
    name_matching    = "User Name is a Real Name"
  ))) %>%
  rename(
    `Mean (HS Users)`       = mean_TRUE,
    `SD (HS User)`          = sd_TRUE,
    `Mean (NO HS User)`     = mean_FALSE,
    `SD (No HS User)`       = sd_FALSE,
    `Mean Difference`       = mean_diff,
    `p-value of difference` = p_value
  )

# Step 8: Write Table C1 to <wd_res>/tables/tabC1.csv (no row names) and print
# a confirmation banner to the console
write.csv(
  final_summary_table,
  file = paste0(wd_res, "/tables/tabC1.csv"),
  row.names = FALSE
)
cat(
  "\n====================\n",
  "Saved Table C1",
  "\n====================\n",
  sep = ""
)









#-- 1) Choose variables of interest: the four binary indicators followed by
#      the seven continuous variables
vars_for_plot <- c(
  "verified", "anonymity_bool", "country_given", "name_matching",
  "log_followers", "log_following", "log_tweet_count",
  "listed_count", "account_age_days", "anonymity", "entropy"
)

#-- 2) Keep the id, the quartile and the plot variables; drop users without a
#      quartile; z-standardize each plot variable on the remaining rows
plot_data <- aggregated_data %>%
  select(user_id, quartile_hatespeech, all_of(vars_for_plot)) %>%
  filter(!is.na(quartile_hatespeech)) %>%
  mutate(
    across(
      all_of(vars_for_plot),
      function(column) as.numeric(scale(column))
    )
  )

#-- 3) Long format: one row per user and variable
plot_data_long <- pivot_longer(
  plot_data,
  cols      = all_of(vars_for_plot),
  names_to  = "variable",
  values_to = "value"
)

#-- 4) Mean and standard error (sd / sqrt of the non-missing count) for each
#      variable within each quartile
summaries <- plot_data_long %>%
  group_by(variable, quartile_hatespeech) %>%
  summarise(
    mean_value = mean(value, na.rm = TRUE),
    se_value   = sd(value, na.rm = TRUE) / sqrt(sum(!is.na(value))),
    .groups    = "drop"
  )

# Replace the internal variable names with display labels, then drop the
# "Personal Identification Score" rows so they are not plotted
summaries <- summaries %>%
  mutate(variable = recode(variable, !!!c(
    verified         = "Verified Account Share",
    anonymity_bool   = "Anonymous Accounts Share",
    country_given    = "Profile contains a Country",
    log_followers    = "log(Followers Count)",
    log_following    = "log(Friends Count)",
    log_tweet_count  = "log(Status Count)",
    account_age_days = "Account Age (days)",
    listed_count     = "No. of lists user is a member of",
    anonymity        = "Personal Identification Score",
    entropy          = "User Name Entropy Score",
    name_matching    = "User Name is a Real Name"
  ))) %>%
  filter(variable != "Personal Identification Score")

#-- 5) Plot: forest plot style
# One row per variable; within each row, one point per hate-speech quartile
# (dodged vertically) with a line spanning mean +/- 1.96 standard errors.
plot_1 <- ggplot(summaries, aes(x = mean_value,
                      y = fct_rev(factor(variable)),  # Reverse for top-down listing
                      color = factor(quartile_hatespeech),
                      shape = factor(quartile_hatespeech))) +
  geom_point(position = position_dodge(width = 0.6), size = 2) +
  geom_linerange(aes(xmin = mean_value - 1.96 * se_value,
                     xmax = mean_value + 1.96 * se_value),
                 position = position_dodge(width = 0.6)) +
  labs(x = "Mean of standardized variable",
       y = "",
       color = "Quartile of Hate Speech:",
       shape = "Quartile of Hate Speech:") +
  # Colors are matched to the quartile levels by name ("1" to "4")
  scale_color_manual(values = c("1" = "grey80",
                                "2" = "grey60",
                                "3" = "grey30",
                                "4" = "black")) +  
  scale_shape_manual(values = c(16, 17, 15, 18)) +
  # Axis breaks every 0.25 across the range covered by the intervals.
  # na.rm = TRUE keeps the range finite when a standard error is NA (a group
  # with a single non-missing value); without it seq() fails on NA bounds.
  scale_x_continuous(
    breaks = seq(
      floor(min(summaries$mean_value - 1.96 * summaries$se_value, na.rm = TRUE)),
      ceiling(max(summaries$mean_value + 1.96 * summaries$se_value, na.rm = TRUE)),
      by = 0.25)
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid.minor = element_blank(),
    axis.text.y = element_text(face = "bold", size = 12),      
    axis.title = element_text(face = "bold", size = 12),
    legend.title = element_text(face = "bold", size = 12)     
  )

# Display the plot, then store the ggplot object for later use
plot_1
saveRDS(plot_1, paste0(wd_data_processed, "/twitter_user_info_aggregated/hate_speech_quartiles_us_data.rds"))

